Setup

knitr::opts_chunk$set(echo = TRUE)

source("RaceID2_StemID_class.R")

## install required packages (only at first time)
install.packages(c("tsne","pheatmap","MASS","cluster","mclust","flexmix","lattice","fpc","RColorBrewer","permute","amap","locfit","vegan"), repos = "http://cran.us.r-project.org")
## Error in install.packages : Updating loaded packages
## input data
x <- read.csv("D:/Documents/SCHOOL/VU/2017-2018 Master Year 2/Project/Seperate Scripts/lymphnode-sc-transcriptomics/data/3 - combinedcounts/LNS_W_ALL.csv",sep=",",header=TRUE)
rownames(x) <- x$GENEID

# prdata: data.frame with transcript counts for all genes (rows) in all cells (columns); with rownames == gene ids; remove ERCC spike-ins 
prdata <- x[grep("ERCC",rownames(x),invert=TRUE),-1]

RaceID Script

## RaceID2
# initialize SCseq object with transcript counts
sc <- SCseq(prdata)
# filtering of expression data
sc <- filterdata(sc, mintotal=1500, minexpr=5, minnumber=1, maxexpr=500, downsample=FALSE, dsn=1, rseed=17000)
# k-medoids clustering
sc <- clustexp(sc,clustnr=30,bootnr=50,metric="pearson",do.gap=FALSE,sat=TRUE,SE.method="Tibs2001SEmax",SE.factor=.25,B.gap=50,cln=0,rseed=17000,FUNcluster="kmedoids")
## Clustering k = 1,2,..., K.max (= 30): .. done
## boot 1 
## boot 2 
## boot 3 
## boot 4 
## boot 5 
## boot 6 
## boot 7 
## boot 8 
## boot 9 
## boot 10 
## boot 11 
## boot 12 
## boot 13 
## boot 14 
## boot 15 
## boot 16 
## boot 17 
## boot 18 
## boot 19 
## boot 20 
## boot 21 
## boot 22 
## boot 23 
## boot 24 
## boot 25 
## boot 26 
## boot 27 
## boot 28 
## boot 29 
## boot 30 
## boot 31 
## boot 32 
## boot 33 
## boot 34 
## boot 35 
## boot 36 
## boot 37 
## boot 38 
## boot 39 
## boot 40 
## boot 41 
## boot 42 
## boot 43 
## boot 44 
## boot 45 
## boot 46 
## boot 47 
## boot 48 
## boot 49 
## boot 50
# compute t-SNE map
sc <- comptsne(sc,rseed=15555)
## sigma summary: Min. : 0.0895715709120785 |1st Qu. : 0.126044344689149 |Median : 0.140730395072168 |Mean : 0.147191667902368 |3rd Qu. : 0.160472081150269 |Max. : 0.315411608654121 |
## Epoch: Iteration #100 error is: 1.49199896725629
## Epoch: Iteration #200 error is: 1.38758770536514
## Epoch: Iteration #300 error is: 1.3292023285217
## Epoch: Iteration #400 error is: 1.29767623172042
## Epoch: Iteration #500 error is: 1.28299126515621
## Epoch: Iteration #600 error is: 1.27481716272011
## Epoch: Iteration #700 error is: 1.27009746271147
## Epoch: Iteration #800 error is: 1.26698531272644
## Epoch: Iteration #900 error is: 1.26492125272787
## Epoch: Iteration #1000 error is: 1.26354699253662
# detect outliers and redefine clusters
sc <- findoutliers(sc, outminc=5,outlg=2,probthr=1e-3,thr=2**-(1:40),outdistquant=.95)
##  [1] 5.000000e-01 2.500000e-01 1.250000e-01 6.250000e-02 3.125000e-02
##  [6] 1.562500e-02 7.812500e-03 3.906250e-03 1.953125e-03 9.765625e-04
## [11] 4.882812e-04 2.441406e-04 1.220703e-04 6.103516e-05 3.051758e-05
## [16] 1.525879e-05 7.629395e-06 3.814697e-06 1.907349e-06 9.536743e-07
## [21] 4.768372e-07 2.384186e-07 1.192093e-07 5.960464e-08 2.980232e-08
## [26] 1.490116e-08 7.450581e-09 3.725290e-09 1.862645e-09 9.313226e-10
## [31] 4.656613e-10 2.328306e-10 1.164153e-10 5.820766e-11 2.910383e-11
## [36] 1.455192e-11 7.275958e-12 3.637979e-12 1.818989e-12 9.094947e-13

RaceID Diagnostic Plots

## diagnostic plots
# gap statistics: only if do.gap == TRUE
##plotgap(sc)
# plot within-cluster dispersion as a function of the cluster number: only if sat == TRUE
plotsaturation(sc,disp=TRUE)

# plot change of the within-cluster dispersion as a function of the cluster number: only if sat == TRUE
plotsaturation(sc)

# silhouette of k-medoids clusters
plotsilhouette(sc)

# Jaccard's similarity of k-medoids clusters
plotjaccard(sc)

# barchart of outlier probabilities
plotoutlierprobs(sc)

# regression of background model
plotbackground(sc)

# dependence of outlier number on probability threshold (probthr)
plotsensitivity(sc)

# heatmap of k-medoids cluster
clustheatmap(sc,final=FALSE,hmethod="single")

##  [1]  2  1  9  8 11  3  7  5 12 10  4  6
# heatmap of final cluster
clustheatmap(sc,final=TRUE,hmethod="single")

##  [1] 44 41 47 39 37 23 42 26 36 18 48 49 40 33 21 25 28 35 17 30 38 27 31
## [24] 13 29 24  4 32 20 46  8 43 11 12 16  3  1  7 14  6 10 34  9 19 22  5
## [47] 15  2 45
# highlight k-medoids clusters in t-SNE map
plottsne(sc,final=FALSE)

# highlight final clusters in t-SNE map
plottsne(sc,final=TRUE)

# highlight cell labels in t-SNE map
plotlabelstsne(sc,labels=sub("(\\_\\d+)","",names(sc@ndata)))

# highlight groups of cells by symbols in t-SNE map
plotsymbolstsne(sc,types=sub("(\\_\\d+)$","", names(sc@ndata)))

# highlight transcirpt counts of a set of genes in t-SNE map, e. g. all Apoa genes
#g <- c("Apoa1__chr9", "Apoa1bp__chr3", "Apoa2__chr1", "Apoa4__chr9", "Apoa5__chr9")
#plotexptsne(sc,g,n="Apoa genes",logsc=TRUE)

RaceID Results

## identification of marker genes
# differentially regulated genes in each cluster compared to the full ensemble
cdiff <- clustdiffgenes(sc,pvalue=.01)

setwd("D:/Documents/SCHOOL/VU/2017-2018 Master Year 2/Project/Seperate Scripts/lymphnode-sc-transcriptomics/data/4 - raceidstemid/All")

## write results to text files
# final clusters 
x <- data.frame(CELLID=names(sc@cpart),cluster=sc@cpart)
write.table(x[order(x$cluster,decreasing=FALSE),],"cell_clust.xls",row.names=FALSE,col.names=TRUE,sep="\t",quote=FALSE)
  
# differentially expressed genes in cluster
for ( n in names(cdiff) ) write.table(data.frame(GENEID=rownames(cdiff[[n]]),cdiff[[n]]),paste(paste("cell_clust_diff_genes",sub("\\.","\\_",n),sep="_"),".xls",sep=""),row.names=FALSE,col.names=TRUE,sep="\t",quote=FALSE)

# differentially expressed genes between two sets of clusters, e. g. cluster 1 and clusters 2,3
d <- diffgenes(sc,cl1=1,cl2=c(2,3),mincount=5)
plotdiffgenes(d,gene=names(d$z)[1])

StemID Script

## StemID

# initialization
ltr <- Ltree(sc)
# computation of the entropy
ltr <- compentropy(ltr)
# computation of the projections for all cells
ltr <- projcells(ltr,cthr=2,nmode=FALSE)
# computation of the projections for all cells after randomization
ltr <- projback(ltr,pdishuf=100,nmode=FALSE,rseed=17000)                  # NOTE: TEMPORARILY CHANGED THIS TO 100 FOR TESTING PURPOSES, TAKES 10min, WAS 2000!
## pdishuffle: 1 
## pdishuffle: 2 
## pdishuffle: 3 
## pdishuffle: 4 
## pdishuffle: 5 
## pdishuffle: 6 
## pdishuffle: 7 
## pdishuffle: 8 
## pdishuffle: 9 
## pdishuffle: 10 
## pdishuffle: 11 
## pdishuffle: 12 
## pdishuffle: 13 
## pdishuffle: 14 
## pdishuffle: 15 
## pdishuffle: 16 
## pdishuffle: 17 
## pdishuffle: 18 
## pdishuffle: 19 
## pdishuffle: 20 
## pdishuffle: 21 
## pdishuffle: 22 
## pdishuffle: 23 
## pdishuffle: 24 
## pdishuffle: 25 
## pdishuffle: 26 
## pdishuffle: 27 
## pdishuffle: 28 
## pdishuffle: 29 
## pdishuffle: 30 
## pdishuffle: 31 
## pdishuffle: 32 
## pdishuffle: 33 
## pdishuffle: 34 
## pdishuffle: 35 
## pdishuffle: 36 
## pdishuffle: 37 
## pdishuffle: 38 
## pdishuffle: 39 
## pdishuffle: 40 
## pdishuffle: 41 
## pdishuffle: 42 
## pdishuffle: 43 
## pdishuffle: 44 
## pdishuffle: 45 
## pdishuffle: 46 
## pdishuffle: 47 
## pdishuffle: 48 
## pdishuffle: 49 
## pdishuffle: 50 
## pdishuffle: 51 
## pdishuffle: 52 
## pdishuffle: 53 
## pdishuffle: 54 
## pdishuffle: 55 
## pdishuffle: 56 
## pdishuffle: 57 
## pdishuffle: 58 
## pdishuffle: 59 
## pdishuffle: 60 
## pdishuffle: 61 
## pdishuffle: 62 
## pdishuffle: 63 
## pdishuffle: 64 
## pdishuffle: 65 
## pdishuffle: 66 
## pdishuffle: 67 
## pdishuffle: 68 
## pdishuffle: 69 
## pdishuffle: 70 
## pdishuffle: 71 
## pdishuffle: 72 
## pdishuffle: 73 
## pdishuffle: 74 
## pdishuffle: 75 
## pdishuffle: 76 
## pdishuffle: 77 
## pdishuffle: 78 
## pdishuffle: 79 
## pdishuffle: 80 
## pdishuffle: 81 
## pdishuffle: 82 
## pdishuffle: 83 
## pdishuffle: 84 
## pdishuffle: 85 
## pdishuffle: 86 
## pdishuffle: 87 
## pdishuffle: 88 
## pdishuffle: 89 
## pdishuffle: 90 
## pdishuffle: 91 
## pdishuffle: 92 
## pdishuffle: 93 
## pdishuffle: 94 
## pdishuffle: 95 
## pdishuffle: 96 
## pdishuffle: 97 
## pdishuffle: 98 
## pdishuffle: 99 
## pdishuffle: 100
# assembly of the lineage tree
ltr <- lineagetree(ltr,pthr=0.01,nmode=FALSE)
# determination of significant differentiation trajectories
ltr <- comppvalue(ltr,pethr=0.01,nmode=FALSE)

StemID Diagnostic Plots

## diagnostic plots
# histogram of ratio between cell-to-cell distances in the embedded and the input space
plotdistanceratio(ltr)

# t-SNE map of the clusters with more than cthr cells including a minimum spanning tree for the cluster medoids
plotmap(ltr)

# visualization of the projections in t-SNE space overlayed with a minimum spanning tree connecting the cluster medoids
plotmapprojections(ltr)

# lineage tree showing the projections of all cells in t-SNE space
plottree(ltr,showCells=TRUE,nmode=FALSE,scthr=0)

# lineage tree without showing the projections of all cells
plottree(ltr,showCells=FALSE,nmode=FALSE,scthr=0)

# heatmap of the enrichment p-values for all inter-cluster links
plotlinkpv(ltr)

# heatmap of the link score for all inter-cluster links
plotlinkscore(ltr)

# heatmap showing the fold enrichment (or depletion) for significantly enriched or depleted links
projenrichment(ltr)

StemID Results

## extract projections onto all links for all cells in a given cluster i
x <- getproj(ltr,i=1)
# heatmap of all projections for cluster i
pheatmap(x$pr)

# heatmap of z-score for all projections for cluster i
pheatmap(x$prz)

## extracting all cells on two branches sharing the same cluster and computing differentially expressed genes between these two branches
x <- branchcells(ltr,list("1.3","1.2"))
# z-scores for differentially expressed genes
head(x$diffgenes$z)
##   mt-Co3     Rps6  Gm28037   Eef1a1  Gm29055    Rpl28 
## 8.535748 7.046961 6.069958 5.783058 5.607424 5.133549
# plotting the cells on the two branches as additional clusters in the t-SNE map
plottsne(x$scl)

## computing the StemID score
x <- compscore(ltr,nn=1)
#plotting the StemID score
plotscore(ltr,1)